require(quanteda)
require(stm)
require(mvtnorm)
require(dichromat)
Paired <- colorschemes$Categorical.12

#### covert the data to stm format ####
load("dfm_matrix.Rdata")

## extract the data of successful candidates
dfm.matrix.winners <- dfm_subset(dfm.matrix, result == 1)
dfm.matrix.winners <- dfm_trim(dfm.matrix.winners, 
                               min_termfreq = 1, 
                               termfreq_type = "count")  # delete unused terms
dim(dfm.matrix.winners)  # number of successful candidates
table(docvars(dfm.matrix.winners, "female"))  # number of successful women candidates

stm.data.winners <- convert(dfm.matrix.winners, to = "stm")
stm.data.winners$meta$year <- factor(stm.data.winners$meta$year)
stm.data.winners$meta$party <- factor(stm.data.winners$meta$party, 
                                      levels = c("LDP", "JSP", "Komeito", "DSP", "JCP", "NFP", "DPJ", "left", "right", "others"))

#### specify the number of topics ####
## from 10- to 90-topic models
number.of.topics.winners.1 <- seq(10, 90, 10)

for (i in 1:length(number.of.topics.winners.1)) {
  set.seed(number.of.topics.winners.1[i])
  STM.result.winners <- stm(documents = stm.data.winners$documents, 
                            vocab = stm.data.winners$vocab, 
                            K = number.of.topics.winners.1[i], 
                            prevalence = ~ female + year + party + age + I(age ^ 2) + incumbent + wins, 
                            data = stm.data.winners$meta)
  save(STM.result.winners, file = paste0("STM_result_winners_", number.of.topics.winners.1[i], ".Rdata"))
}

exclusivity.res.winners.1 <- SC.res.winners.1 <- rep(NA, length(number.of.topics.winners.1))
for (i in 1:length(number.of.topics.winners.1)) {
  load(paste0("STM_result_winners_", number.of.topics.winners.1[i], ".Rdata"))
  exclusivity.res.winners.1[i] <- mean(exclusivity(STM.result.winners))
  SC.res.winners.1[i] <- mean(semanticCoherence(STM.result.winners, stm.data.winners$documents))
}

# Figure A.8
cairo_pdf("Figure_A8.pdf", 
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2), lwd = 0.5)
plot(c(1:length(number.of.topics.winners.1)), SC.res.winners.1, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.winners.1), 2), 
     labels = number.of.topics.winners.1[seq(1, length(number.of.topics.winners.1), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
plot(c(1:length(number.of.topics.winners.1)), exclusivity.res.winners.1, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.winners.1), 2), 
     labels = number.of.topics.winners.1[seq(1, length(number.of.topics.winners.1), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
dev.off()

## from 40- to 60-topic models
number.of.topics.winners.2 <- c(41:49, 51:59)

for (i in 1:length(number.of.topics.winners.2)) {
  set.seed(number.of.topics.winners.2[i])
  STM.result.winners <- stm(documents = stm.data.winners$documents, 
                            vocab = stm.data.winners$vocab, 
                            K = number.of.topics.winners.2[i], 
                            prevalence = ~ female + year + party + age + I(age ^ 2) + incumbent + wins, 
                            data = stm.data.winners$meta)
  save(STM.result.winners, file = paste0("STM_result_winners_", number.of.topics.winners.2[i], ".Rdata"))
}

number.of.topics.winners.3 <- 40:60

exclusivity.res.winners.2 <- SC.res.winners.2 <- rep(NA, length(number.of.topics.winners.3))
for (i in 1:length(number.of.topics.winners.3)) {
  load(paste0("STM_result_winners_", number.of.topics.winners.3[i], ".Rdata"))
  exclusivity.res.winners.2[i] <- mean(exclusivity(STM.result.winners))
  SC.res.winners.2[i] <- mean(semanticCoherence(STM.result.winners, stm.data.winners$documents))
}

# Figure A.9
cairo_pdf("Figure_A9.pdf", 
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2), lwd = 0.5)
plot(c(1:length(number.of.topics.winners.3)), SC.res.winners.2, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.winners.3), 2), 
     labels = number.of.topics.winners.3[seq(1, length(number.of.topics.winners.3), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
plot(c(1:length(number.of.topics.winners.3)), exclusivity.res.winners.2, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.winners.3), 2), 
     labels = number.of.topics.winners.3[seq(1, length(number.of.topics.winners.3), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
dev.off()

#### list of topics (Online Appendix F.2) ####
load("STM_result_winners_48.Rdata")

topic.order.winners <- order(colMeans(STM.result.winners$theta), decreasing = TRUE)
data.frame(labelTopics(STM.result.winners, n = 5)$prob, 
           prob = round(colMeans(STM.result.winners$theta), 3) * 100)[topic.order.winners, ]

topic.labels.winners <- c("Candidate information", "Decentralization", "Political mistrust", "Change of government", 
                          "Public security", "Budget", "Political reform", "Legislative records", "Administrative efficiency", 
                          "Child care", "Issues in the 2000 election", "Economic measures", "Appeals for honesty", 
                          "Abolition of the consumption tax", "Postal privatization", "World affairs", 
                          "Traffic network development", "Abstract principles", "DPJ manifesto (2003)", 
                          "National problems in general", "DPJ manifesto (2009)", "JCP manifesto (1996-2009)", 
                          "Appeals for novelty", "Civil society", "Okinawan politics", "Pacifism", "Public pension", 
                          "Infrastructure", "Environmental issues", "Social welfare", "Criticism of the government (1993)", 
                          "Railways", "Gratitude for support", "Taxation in general", "Policies of the Nakasone government", 
                          "Policy support for mothers", "Support for rural areas", "Agriculture and fisheries", 
                          "Appeals for their firm beliefs", "Administrative reform", "JCP manifesto (1986-90)", 
                          "Local development", "Community development", "DPJ manifesto (2005)", "Appeals of their efforts", 
                          "Science and technology", "Appeals for change", "Local topics")
colnames(STM.result.winners$theta) <- topic.labels.winners

#### post-estimation simulation ####
## conduct the post-estimation simulation
set.seed(12345)
post.sim.winners <- estimateEffect(formula(1:48 ~ female + year + party + age + I(age ^ 2) + incumbent + wins), 
                                   STM.result.winners, stm.data.winners$meta, nsims = 100)
summary.post.sim.winners <- summary(post.sim.winners)

## which topics show a large gender difference
gender.diff.winners <- matrix(NA, 48, 4)
for (i in 1:48) {
  gender.diff.winners[i, ] <- summary.post.sim.winners$tables[[i]][2, ]  # the second row corresponds to the coefficient of the female dummy
}
rownames(gender.diff.winners) <- topic.labels.winners
significant.topics.winners <- p.adjust(gender.diff.winners[, 4], "holm") < 0.05
round(gender.diff.winners[significant.topics.winners, ], 3)  # topics whose gender difference is significant with the Holm correction
topic.labels.winners[significant.topics.winners]
gender.diff.winners.order <- order(gender.diff.winners[, 1], decreasing = TRUE)

# top 10 most frequent words in "policy support for mothers" (Online Appendix footnote 1)
labelTopics(STM.result.winners, n = 10)$prob[36, ]

# top 10 most frequent words in "social welfare" (Online Appendix footnote 2)
labelTopics(STM.result.winners, n = 10)$prob[30, ]

# top 10 most frequent words in "pacifism" (Online Appendix footnote 2)
labelTopics(STM.result.winners, n = 10)$prob[26, ]

#### compute the first-differences (FDs) ####
## function implementing the procedure explained in Online Appendix A.3
predict.fun.model1 <- function(result, covname, cov.value1, cov.value2, sim.beta) {
  cdata <- result$data
  cdata[, covname] <- cov.value1
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.1 <- tcrossprod(design.matrix, sim.beta)
  cdata[, covname] <- cov.value2
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.2 <- tcrossprod(design.matrix, sim.beta)
  dif.sim.result <- sim.result.1 - sim.result.2
  dif.sim.mean <- mean(dif.sim.result)
  dif.sim.CI <- quantile(colMeans(dif.sim.result), c(0.025, 0.975))
  c(dif.sim.mean,  # point estimate of FD
    dif.sim.CI,  # confidence interval
    mean(sim.result.1),  # expected prevalence when covname takes cov.value1
    mean(sim.result.2))  # expected prevalence when covname takes cov.value2
}

## compute the FDs for gender
female.sim.result.winners <- matrix(NA, 48, 5)
for (i in 1:48) {
  set.seed(12345)
  sim.beta <- do.call(rbind, 
                      lapply(post.sim.winners$parameters[[i]], 
                             function(x) rmvnorm(100, x$est, x$vcov)))
  female.sim.result.winners[i, ] <- predict.fun.model1(post.sim.winners, "female", 1, 0, sim.beta)
}

## how large the differences in female-stereotypic topics are
rownames(female.sim.result.winners) <- topic.labels.winners
round(female.sim.result.winners[c(36, 30, 26), ], 3)
round(female.sim.result.winners[c(36, 30, 26), 4] / 
        female.sim.result.winners[c(36, 30, 26), 5], 2)

# Figure A.10
cairo_pdf("Figure_A10.pdf", 
          width = 6, height = 5, pointsize = 7, family = "Helvetica")
par(mar = c(4, 1, 1, 1), lwd = 0.5)
plot(NULL, NULL, type = "n", bty = "n", 
     xlim = c(-0.055, 0.042), ylim = c(1, 48), 
     xlab = "", ylab = "", xaxt = "n", yaxt = "n")
abline(v = seq(-0.03, 0.04, 0.01), col = "gray")
segments(rep(-0.032, 48), 1:48, 
         rep(0.042, 48), 1:48, lty = 3, col = "gray")
points(female.sim.result.winners[gender.diff.winners.order, 1], 48:1, 
       pch = ifelse(significant.topics.winners[gender.diff.winners.order], 15, 19), 
       col = ifelse(significant.topics.winners[gender.diff.winners.order], Paired[12], "black"))
segments(female.sim.result.winners[gender.diff.winners.order, 2], 48:1, 
         female.sim.result.winners[gender.diff.winners.order, 3], 48:1, 
         lwd = 0.5, col = ifelse(significant.topics.winners[gender.diff.winners.order], Paired[12], "black"))
text(rep(-0.032, 48), 48:1, 
     labels = topic.labels.winners[gender.diff.winners.order], pos = 2, cex = 0.8)
axis(1, at = c(seq(-0.03, 0.04, 0.01)), lwd = 0.5)
mtext(c("more prevalent\namong men", "more prevalent\namong women"), 
      side = 1, at = c(-0.03, 0.04), line = 3)
dev.off()